# Source Generated with Decompyle++
# File: in.pyc (Python 2.4)

from __future__ import generators

import math

try:
    Set = set
except NameError:
    try:
        from sets import Set
    except ImportError:
        from spambayes.compatsets import Set

import re
import os
import sys
import socket
import pickle
import urllib2
from email import message_from_string

try:
    enumerate
except NameError:
    def enumerate(seq):
        i = 0
        for elt in seq:
            yield (i, elt)
            i += 1

DOMAIN_AND_PORT_RE = re.compile(r"([^:/\\]+)(:([\d]+))?")
HTTP_ERROR_RE = re.compile(r"HTTP Error ([\d]+)")
URL_KEY_RE = re.compile(r"[\W]")

from spambayes.Options import options
from spambayes.chi2 import chi2Q

try:
    True, False
except NameError:
    True, False = 1, 0

LN2 = math.log(2)

slurp_wordstream = None

PICKLE_VERSION = 5


class WordInfo(object):
    # spamcount/hamcount are the number of trained spam and ham messages
    # in which this word has been seen.
    __slots__ = ('spamcount', 'hamcount')

    def __init__(self):
        self.__setstate__((0, 0))

    def __repr__(self):
        return 'WordInfo' + repr((self.spamcount, self.hamcount))

    def __getstate__(self):
        return (self.spamcount, self.hamcount)

    def __setstate__(self, t):
        (self.spamcount, self.hamcount) = t


class Classifier:
    WordInfoClass = WordInfo

    def __init__(self):
        self.wordinfo = {}
        self.probcache = {}
        self.nspam = self.nham = 0

    def __getstate__(self):
        return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)

    def __setstate__(self, t):
        if t[0] != PICKLE_VERSION:
            raise ValueError("Can't unpickle -- version %s unknown" % t[0])
        (self.wordinfo, self.nspam, self.nham) = t[1:]
        self.probcache = {}

    def chi2_spamprob(self, wordstream, evidence=False):
        """Return best-guess probability that wordstream is spam.

        wordstream is an iterable object producing words.
        The return value is a float in [0.0, 1.0].

        If optional arg evidence is True, the return value is a pair
            probability, evidence
        where evidence is a list of (word, probability) pairs.
        """
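        # Combining scheme (Fisher/Robinson): under the hypothesis that the
        # individual word probabilities p_i are independent and uniform,
        # -2 * sum(ln p_i) follows a chi-squared distribution with 2*n
        # degrees of freedom.  S below is the chi-squared "spamminess"
        # measure built from the (1 - p_i), H the "hamminess" measure built
        # from the p_i, and the final score is (S - H + 1) / 2, which maps
        # the difference into [0.0, 1.0].  Because the raw products can
        # underflow to 0.0 for long messages, math.frexp() keeps each
        # product as a (mantissa, exponent) pair, and the exponents are
        # folded back in via ln(x * 2**e) = ln(x) + e * LN2.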
        from math import frexp, log as ln

        H = S = 1.0
        Hexp = Sexp = 0

        clues = self._getclues(wordstream)
        for prob, word, record in clues:
            S *= 1.0 - prob
            H *= prob
            if S < 1e-200:  # prevent underflow
                S, e = frexp(S)
                Sexp += e
            if H < 1e-200:  # prevent underflow
                H, e = frexp(H)
                Hexp += e

        S = ln(S) + Sexp * LN2
        H = ln(H) + Hexp * LN2

        n = len(clues)
        if n:
            S = 1.0 - chi2Q(-2.0 * S, 2 * n)
            H = 1.0 - chi2Q(-2.0 * H, 2 * n)
            prob = (S - H + 1.0) / 2.0
        else:
            prob = 0.5

        if evidence:
            clues = [(w, p) for p, w, _r in clues]
            clues.sort(lambda a, b: cmp(a[1], b[1]))
            clues.insert(0, ('*S*', S))
            clues.insert(0, ('*H*', H))
            return prob, clues
        else:
            return prob

    def slurping_spamprob(self, wordstream, evidence=False):
        """Do the standard chi-squared spamprob, but if the evidence
        leaves the score in the unsure range, and we have fewer tokens
        than max_discriminators, also generate tokens from the text
        obtained by following http URLs in the message."""
        h_cut = options["Categorization", "ham_cutoff"]
        s_cut = options["Categorization", "spam_cutoff"]

        # Get the raw score.
        prob, clues = self.chi2_spamprob(wordstream, True)

        # If necessary, enhance it with tokens from whatever is at the
        # destination of the URLs in the message.
        if len(clues) < options["Classifier", "max_discriminators"] and \
           h_cut < prob < s_cut and slurp_wordstream:
            slurp_tokens = list(self._generate_slurp())
            slurp_tokens.extend([w for (w, _p) in clues])
            sprob, sclues = self.chi2_spamprob(slurp_tokens, True)
            if sprob < h_cut or sprob > s_cut:
                prob = sprob
                clues = sclues

        if evidence:
            return prob, clues
        return prob
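
    # spamprob is bound at class-definition time: when chi-squared
    # combining is enabled, it is either the URL-slurping variant above or
    # the plain chi-squared combiner, depending on the experimental
    # "x-slurp_urls" option.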
    if options["Classifier", "use_chi_squared_combining"]:
        if options["URLRetriever", "x-slurp_urls"]:
            spamprob = slurping_spamprob
        else:
            spamprob = chi2_spamprob

    def learn(self, wordstream, is_spam):
        """Teach the classifier by example.

        wordstream is a word stream representing a message.  If is_spam is
        True, you're telling the classifier this message is definitely spam,
        else that it's definitely not spam.
        """
        if options["Classifier", "use_bigrams"]:
            wordstream = self._enhance_wordstream(wordstream)
        if options["URLRetriever", "x-slurp_urls"]:
            wordstream = self._add_slurped(wordstream)
        self._add_msg(wordstream, is_spam)

    def unlearn(self, wordstream, is_spam):
        """In case of pilot error, call unlearn ASAP after screwing up.

        Pass the same arguments you passed to learn().
        """
        if options["Classifier", "use_bigrams"]:
            wordstream = self._enhance_wordstream(wordstream)
        if options["URLRetriever", "x-slurp_urls"]:
            wordstream = self._add_slurped(wordstream)
        self._remove_msg(wordstream, is_spam)

    def probability(self, record):
        """Compute, store, and return prob(msg is spam | msg contains word).

        This is the Graham calculation, but stripped of biases, and
        stripped of clamping into 0.01 thru 0.99.  The Bayesian
        adjustment following keeps them in a sane range, and one
        that naturally grows the more evidence there is to back up
        a probability.
        """
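        # The adjusted probability is
        #
        #     prob = (S * x + n * p) / (S + n)
        #
        # where p = spamratio / (hamratio + spamratio) is the raw Graham
        # estimate, n = spamcount + hamcount is the amount of evidence for
        # this word, S is the "unknown_word_strength" option and x the
        # "unknown_word_prob" option.  With no evidence (n == 0) the result
        # is x; as n grows it converges toward the raw estimate p.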
        spamcount = record.spamcount
        hamcount = record.hamcount

        # Try the cache first.
        try:
            return self.probcache[spamcount][hamcount]
        except KeyError:
            pass

        nham = float(self.nham or 1)
        nspam = float(self.nspam or 1)

        assert hamcount <= nham, "Token seen in more ham than ham trained."
        hamratio = hamcount / nham

        assert spamcount <= nspam, "Token seen in more spam than spam trained."
        spamratio = spamcount / nspam

        prob = spamratio / (hamratio + spamratio)

        S = options["Classifier", "unknown_word_strength"]
        StimesX = S * options["Classifier", "unknown_word_prob"]
        n = hamcount + spamcount
        prob = (StimesX + n * prob) / (S + n)

        # Update the cache.
        try:
            self.probcache[spamcount][hamcount] = prob
        except KeyError:
            self.probcache[spamcount] = {hamcount: prob}

        return prob
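
    # Training.  _add_msg() and _remove_msg() keep the per-word
    # spamcount/hamcount records and the global nspam/nham message counts
    # in sync, and invalidate probcache, since probability() memoizes its
    # results keyed on (spamcount, hamcount).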
    def _add_msg(self, wordstream, is_spam):
        self.probcache = {}     # nuke the prob cache
        if is_spam:
            self.nspam += 1
        else:
            self.nham += 1

        for word in Set(wordstream):
            record = self._wordinfoget(word)
            if record is None:
                record = self.WordInfoClass()

            if is_spam:
                record.spamcount += 1
            else:
                record.hamcount += 1

            self._wordinfoset(word, record)

        self._post_training()

    def _remove_msg(self, wordstream, is_spam):
        self.probcache = {}     # nuke the prob cache
        if is_spam:
            if self.nspam <= 0:
                raise ValueError('spam count would go negative!')
            self.nspam -= 1
        else:
            if self.nham <= 0:
                raise ValueError('non-spam count would go negative!')
            self.nham -= 1

        for word in Set(wordstream):
            record = self._wordinfoget(word)
            if record is not None:
                if is_spam:
                    if record.spamcount > 0:
                        record.spamcount -= 1
                else:
                    if record.hamcount > 0:
                        record.hamcount -= 1
                if record.hamcount == 0 == record.spamcount:
                    self._wordinfodel(word)
                else:
                    self._wordinfoset(word, record)

        self._post_training()

    def _post_training(self):
        """This is called after training on a wordstream.  Subclasses might
        want to ensure that their databases are in a consistent state at
        this point.  Introduced to fix bug #797890."""
        pass
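
    # Clue selection.  _getclues() keeps only the tokens whose probabilities
    # are at least "minimum_prob_strength" away from the neutral 0.5, and
    # caps the number of clues at "max_discriminators", so that a handful of
    # strong words decides the score rather than the whole message.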
    def _getclues(self, wordstream):
        mindist = options["Classifier", "minimum_prob_strength"]

        if options["Classifier", "use_bigrams"]:
            # This scheme mixes single tokens with pairs of adjacent tokens.
            # wordstream is "tiled" into non-overlapping unigrams and
            # bigrams, so that no original token contributes to more than
            # one returned clue.  raw holds
            #     ((distance, prob, word, record), indices)
            # pairs for every unigram and bigram; indices records which
            # positions in wordstream produced the clue (a 1-tuple for a
            # unigram, a 2-tuple for a bigram), so overlaps can be detected.
            raw = []
            push = raw.append
            pair = None
            # Keep track of which tokens we've already seen.
            seen = {pair: 1}    # so the bigram token is skipped on 1st trip
            for i, token in enumerate(wordstream):
                if i:   # not the 1st trip, so there is a preceding token
                    # This string interpolation must match the one in
                    # _enhance_wordstream().
                    pair = 'bi:%s %s' % (last_token, token)
                last_token = token
                for clue, indices in ((token, (i,)), (pair, (i - 1, i))):
                    if clue not in seen:    # as always, skip duplicates
                        seen[clue] = 1
                        tup = self._worddistanceget(clue)
                        if tup[0] >= mindist:
                            push((tup, indices))

            # Sort raw, strongest to weakest clue.
            raw.sort()
            raw.reverse()

            # Fill clues with the strongest non-overlapping clues.
            clues = []
            push = clues.append
            seen = {}   # indices that have already contributed to a clue
            for tup, indices in raw:
                overlap = [i for i in indices if i in seen]
                if not overlap:     # no overlap with anything already chosen
                    for i in indices:
                        seen[i] = 1
                    push(tup)
            clues.reverse()

        else:
            # The all-unigram scheme just scores the tokens as-is.  A Set()
            # is used to weed out duplicates at high speed.
            clues = []
            push = clues.append
            for word in Set(wordstream):
                tup = self._worddistanceget(word)
                if tup[0] >= mindist:
                    push(tup)
            clues.sort()

        if len(clues) > options["Classifier", "max_discriminators"]:
            del clues[0:-options["Classifier", "max_discriminators"]]

        # Return (prob, word, record) triples.
        return [t[1:] for t in clues]

    def _worddistanceget(self, word):
        record = self._wordinfoget(word)
        if record is None:
            prob = options["Classifier", "unknown_word_prob"]
        else:
            prob = self.probability(record)
        distance = abs(prob - 0.5)
        return (distance, prob, word, record)

    def _wordinfoget(self, word):
        return self.wordinfo.get(word)

    def _wordinfoset(self, word, record):
        self.wordinfo[word] = record

    def _wordinfodel(self, word):
        del self.wordinfo[word]

    def _enhance_wordstream(self, wordstream):
        """Add bigrams to the wordstream.

        For example, a b c -> a b "a b" c "b c"

        Note that these are *token* bigrams, and not *word* bigrams - i.e.
        'synthetic' tokens get bigram'ed, too.

        The bigram token is simply "bi:unigram1 unigram2" - a space should
        be sufficient as a separator, since spaces aren't in any other
        tokens, apart from 'synthetic' ones.  The "bi:" prefix is added
        to avoid conflict with tokens we generate (like "subject: word",
        which could be "word" in a subject, or a bigram of "subject:" and
        "word").

        If the "Classifier":"use_bigrams" option is removed, this function
        can be removed, too.
        """
        last = None
        for token in wordstream:
            yield token
            if last:
                yield 'bi:%s %s' % (last, token)
            last = token
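
    # URL "slurping".  slurp_wordstream is set elsewhere (by the tokenizer)
    # to a (proto, url) pair to follow; _generate_slurp() lazily sets up the
    # URL cache on first use and then fetches and tokenizes the page behind
    # that URL.  The do_slurp flag keeps slurp() from recursively slurping
    # URLs found in the pages it retrieves.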
    def _generate_slurp(self):
        if not hasattr(self, 'setup_done'):
            self.setup()
            self.setup_done = True

        if not hasattr(self, 'do_slurp') or self.do_slurp:
            if slurp_wordstream:
                self.do_slurp = False

                tokens = self.slurp(*slurp_wordstream)
                self.do_slurp = True
                self._save_caches()
                return tokens

        return []

    def setup(self):
        # Local import: FileCorpus is only needed when URL slurping is
        # enabled.
        from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory

        username = options["globals", "proxy_username"]
        password = options["globals", "proxy_password"]
        server = options["globals", "proxy_server"]
        if server.find(':') != -1:
            (server, port) = server.split(':', 1)
            port = int(port)    # the %d format below needs an integer
        else:
            port = 8080
        if server:
            # Build an opener that uses a proxy requiring authorization.
            proxy_support = urllib2.ProxyHandler(
                {'http': 'http://%s:%s@%s:%d' % (username, password,
                                                 server, port)})
            opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        else:
            # Build an opener without any proxy.
            opener = urllib2.build_opener(urllib2.HTTPHandler)
        urllib2.install_opener(opener)

        # Set up the cache for retrieved urls.
        age = options["URLRetriever", "x-cache_expiry_days"] * 24 * 60 * 60
        dir = options["URLRetriever", "x-cache_directory"]
        if not os.path.exists(dir):
            if options["globals", "verbose"]:
                print >> sys.stderr, 'Creating URL cache directory'
            os.makedirs(dir)

        self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
                                          dir, cacheSize=20)
        # Kill any old information in the cache.
        self.urlCorpus.removeExpiredMessages()

        # Set up caches for unretrievable urls.
        self.bad_url_cache_name = os.path.join(dir, 'bad_urls.pck')
        self.http_error_cache_name = os.path.join(dir, 'http_error_urls.pck')
        if os.path.exists(self.bad_url_cache_name):
            b_file = file(self.bad_url_cache_name, 'r')
            try:
                self.bad_urls = pickle.load(b_file)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle, probably).
                # Start afresh.
                if options["globals", "verbose"]:
                    print >> sys.stderr, 'Bad URL pickle, using new.'
                self.bad_urls = {'url:non_resolving': (),
                                 'url:non_html': (),
                                 'url:unknown_error': ()}
            b_file.close()
        else:
            if options["globals", "verbose"]:
                print "URL caches don't exist: creating"
            self.bad_urls = {'url:non_resolving': (),
                             'url:non_html': (),
                             'url:unknown_error': ()}
        if os.path.exists(self.http_error_cache_name):
            h_file = file(self.http_error_cache_name, 'r')
            try:
                self.http_error_urls = pickle.load(h_file)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle, probably).
                # Start afresh.
                if options["globals", "verbose"]:
                    print >> sys.stderr, 'Bad HTTP error pickle, using new.'
                self.http_error_urls = {}
            h_file.close()
        else:
            self.http_error_urls = {}

    def _save_caches(self):
        for name, data in [(self.bad_url_cache_name, self.bad_urls),
                           (self.http_error_cache_name, self.http_error_urls)]:
            # Save to a temporary file first, in case something goes wrong.
            cache = open(name + '.tmp', 'w')
            pickle.dump(data, cache)
            cache.close()
            try:
                os.rename(name + '.tmp', name)
            except OSError:
                # Windows cannot rename over an existing file; remove the
                # old cache first, then rename.
                os.remove(name)
                os.rename(name + '.tmp', name)
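
    # slurp() fetches the page behind (proto, url), caches it in urlCorpus
    # as a fake message (headers + body), and returns the tokens produced by
    # running the standard Tokenizer over it, each prefixed with the
    # "x-web_prefix" option so slurped tokens can be told apart from tokens
    # generated from the message itself.  Unresolvable, non-HTML and
    # erroring URLs are remembered in the bad-URL caches and yield synthetic
    # "url:..." tokens instead.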
    def slurp(self, proto, url):
        # We generate these synthetic tokens:
        #   url:non_resolving
        #   url:non_html
        #   url:unknown_error
        #   url:http_XXX (for each type of http error found, e.g. 404, 403)
        if not url:
            return ['url:non_resolving']

        from spambayes.tokenizer import Tokenizer

        if options["URLRetriever", "x-only_slurp_base"]:
            url = self._base_url(url)

        # Check the unretrievable caches first.
        for err in self.bad_urls.keys():
            if url in self.bad_urls[err]:
                return [err]
        if self.http_error_urls.has_key(url):
            return self.http_error_urls[url]

        # Check that the domain resolves before trying to retrieve
        # anything from it.
        mo = DOMAIN_AND_PORT_RE.match(url)
        domain = mo.group(1)
        if mo.group(3) is None:
            port = 80
        else:
            port = mo.group(3)
        try:
            not_used = socket.getaddrinfo(domain, port)
        except socket.error:
            self.bad_urls['url:non_resolving'] += (url,)
            return ['url:non_resolving']

        # If the page is in our cache, skip the network and use that copy.
        url_key = URL_KEY_RE.sub('_', url)
        cached_message = self.urlCorpus.get(url_key)
        if cached_message is None:
            # We're going to ignore everything that isn't text/html, so
            # don't bother retrieving anything with these extensions.
            parts = url.split('.')
            if parts[-1] in ('jpg', 'gif', 'png', 'css', 'js'):
                self.bad_urls['url:non_html'] += (url,)
                return ['url:non_html']

            # Waiting for the default timeout period slows everything down
            # far too much, so reduce it for just this call (this only
            # works with Python 2.3 and above).
            try:
                timeout = socket.getdefaulttimeout()
                socket.setdefaulttimeout(5)
            except AttributeError:
                pass

            try:
                if options["globals", "verbose"]:
                    print >> sys.stderr, 'Slurping', url
                f = urllib2.urlopen('%s://%s' % (proto, url))
            except (urllib2.URLError, socket.error), details:
                mo = HTTP_ERROR_RE.match(str(details))
                if mo:
                    self.http_error_urls[url] = 'url:http_' + mo.group(1)
                    return ['url:http_' + mo.group(1)]
                self.bad_urls['url:unknown_error'] += (url,)
                return ['url:unknown_error']

            # Restore the default timeout.
            try:
                socket.setdefaulttimeout(timeout)
            except AttributeError:
                pass

            try:
                # Anything that isn't text/html is ignored.
                content_type = f.info().get('content-type')
                if content_type is None or \
                   not content_type.startswith('text/html'):
                    self.bad_urls['url:non_html'] += (url,)
                    return ['url:non_html']

                page = f.read()
                headers = str(f.info())
                f.close()
            except socket.error:
                # Probably a temporary error, like a timeout; just bail out.
                return []

            fake_message_string = headers + '\r\n' + page

            # Retrieving the same page over and over would tire us out, so
            # store it in our own little cache.
            message = self.urlCorpus.makeMessage(url_key, fake_message_string)
            self.urlCorpus.addMessage(message)
        else:
            fake_message_string = cached_message.as_string()

        msg = message_from_string(fake_message_string)

        # We don't want full header tokenization (that is tuned for mail,
        # not webpages), so just do the basic header tokenization.
        bht = options["Tokenizer", "basic_header_tokenize"]
        bhto = options["Tokenizer", "basic_header_tokenize_only"]
        options["Tokenizer", "basic_header_tokenize"] = True
        options["Tokenizer", "basic_header_tokenize_only"] = True

        tokens = Tokenizer().tokenize(msg)
        pf = options["URLRetriever", "x-web_prefix"]
        tokens = ['%s%s' % (pf, tok) for tok in tokens]

        # Undo the option changes.
        options["Tokenizer", "basic_header_tokenize"] = bht
        options["Tokenizer", "basic_header_tokenize_only"] = bhto
        return tokens
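
    # _base_url() keeps only the base domain of a URL; with illustrative
    # values, "www.example.com/some/page" -> "example.com" and
    # "www.example.co.uk/page" -> "example.co.uk" (a third component is
    # kept when the last one is a short, country-code style suffix).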
    def _base_url(self, url):
        url += '/'
        (domain, garbage) = url.split('/', 1)
        parts = domain.split('.')
        if len(parts) > 2:
            base_domain = parts[-2] + '.' + parts[-1]
            if len(parts[-1]) < 3:
                base_domain = parts[-3] + '.' + base_domain
        else:
            base_domain = domain
        return base_domain

    def _add_slurped(self, wordstream):
        """Add tokens generated by 'slurping' (i.e. tokenizing
        the text at the web pages pointed to by URLs in messages)
        to the wordstream."""
        for token in wordstream:
            yield token
        slurped_tokens = self._generate_slurp()
        for token in slurped_tokens:
            yield token

    def _wordinfokeys(self):
        return self.wordinfo.keys()


Bayes = Classifier
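

# A minimal usage sketch (illustrative only: the token lists below are made
# up, real word streams come from spambayes.tokenizer, and spamprob is only
# bound when the default "use_chi_squared_combining" option is enabled).
if __name__ == '__main__':
    bayes = Bayes()
    bayes.learn(['cheap', 'pills', 'click', 'here'], True)          # spam
    bayes.learn(['meeting', 'agenda', 'for', 'thursday'], False)    # ham
    score = bayes.spamprob(['cheap', 'pills', 'for', 'thursday'])
    # score is a float in [0.0, 1.0]; compare it against the
    # "ham_cutoff"/"spam_cutoff" options to categorize a message.
    print 'spamprob:', score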